/*    Copyright 2010 Tobias Marschall
 *
 *    This file is part of MoSDi.
 *
 *    MoSDi is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    MoSDi is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with MoSDi.  If not, see <http://www.gnu.org/licenses/>.
 */

package mosdi.util;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;

import mosdi.fa.Alphabet;
import mosdi.fa.CDFA;
import mosdi.fa.DFAFactory;
import mosdi.fa.FiniteMemoryTextModel;
import mosdi.fa.GeneralizedString;
import mosdi.fa.IIDTextModel;
import mosdi.fa.MarkovianTextModel;
import mosdi.paa.ClumpSizeCalculator;
import mosdi.paa.MatchCountDAA;
import mosdi.paa.TextBasedPAA;

public class SequenceUtils {

	/** Creates a long sequence by concatenation of individual sequences separated by -1.
	 *  If reverseComlement is true, for each sequence, its reverse complement will also
	 *  be added: [seq1],-1,[seq1rev],-1,[seq2],-1,[seq2rev],-1, etc.
	 */
	public static int[] concatSequences(List<NamedSequence> l, boolean reverseComplements) {
		int totalLength = 0;
		for (NamedSequence ns : l) {
			totalLength+=(ns.length()+1) * (reverseComplements?2:1);
		}
		int[] result = new int[totalLength];
		int i = 0;
		for (NamedSequence ns : l) {
			System.arraycopy(ns.getSequence(), 0, result, i, ns.length());
			i+=ns.length();
			result[i++]=-1;
			if (reverseComplements) {
				reverseComplementaryCopyTo(ns.getSequence(),result,i);
				i += ns.length();
				result[i++]=-1;
			}
		}
		return result;
	}

	/** Similar to concatSequences(). */
	public static int[] concatAnnotations(List<NamedSequence> l, String trackName, boolean reverseComplements) {
		int totalLength = 0;
		for (NamedSequence ns : l) {
			totalLength+=(ns.length()+1) * (reverseComplements?2:1);
		}
		int[] result = new int[totalLength];
		int i = 0;
		for (NamedSequence ns : l) {
			int[] track = ns.getAnnotationTrack(trackName);
			System.arraycopy(track, 0, result, i, ns.length());
			i+=ns.length();
			result[i++]=-1;
			if (reverseComplements) {
				for (int j=0; j<ns.length(); ++j) {
					result[i++] = track[ns.length()-j-1];
				}
				result[i++]=-1;
			}
		}
		return result;
	}

	/** Assembles sequences and an annotation track into one sequence using a ProductAlphabet. 
	 *  Similar to concatSequences(). 
	 *  @param productAlphabet First component: sequence; second component: annotation track.
	 */
	public static int[] concatAndJoin(List<NamedSequence> l, ProductEncoder productAlphabet, String trackName, boolean reverseComplements) {
		int totalLength = 0;
		for (NamedSequence ns : l) {
			totalLength+=(ns.length()+1) * (reverseComplements?2:1);
		}
		int[] result = new int[totalLength];
		int i = 0;
		for (NamedSequence ns : l) {
			int[] part = productAlphabet.encodeSequence(true, ns.getSequence(), ns.getAnnotationTrack(trackName));
			System.arraycopy(part, 0, result, i, ns.length());
			i+=ns.length();
			result[i++]=-1;
			if (reverseComplements) {
				part = productAlphabet.encodeSequence(true, reverseComplementaryCopy(ns.getSequence()), ArrayUtils.reverseCopy(ns.getAnnotationTrack(trackName)));
				System.arraycopy(part, 0, result, i, ns.length());
				i+=ns.length();
				result[i++]=-1;
			}
		}
		return result;
	}

	/** Given a sequence over the DNA alphabet {A=0,C=1,G=2,T=3}, returns the 
	 *  reverse complementary sequence (leaving the original sequence untouched).
	 *  Characters not in {0,1,2,3} are copied unchanged. 
	 */
	public static int[] reverseComplementaryCopy(int[] sequence) {
		int[] result = new int[sequence.length];
		reverseComplementaryCopyTo(sequence, result, 0);
		return result;
	}

	/** Given a sequence over the DNA alphabet {A=0,C=1,G=2,T=3}, copies the 
	 *  reverse complementary sequence into a target array.
	 *  Characters not in {0,1,2,3} are copied unchanged.
	 *  @param position Gives the start position in array target. 
	 */
	public static void reverseComplementaryCopyTo(int[] sequence, int[] target, int position) {
		for (int j=0; j<sequence.length; ++j) {
			int c = sequence[sequence.length-j-1];
			if ((c>=0) && (c<=3)) {
				// 3 - c = complementary
				target[position++] = 3 - c;
			} else {
				// Leave character that are not ACGT (especially negative ones) untouched.
				target[position++] = c;
			}
		}
	}

	public static List<NamedSequence> readFastaFile(String filename, Alphabet alphabet) throws FileNotFoundException, IOException, InvalidInputFileException {
		return readFastaFile(filename,alphabet,false);
	}

	public static List<NamedSequence> readFastaFile(String filename, Alphabet alphabet, boolean replaceUnknownByMinusOne) throws FileNotFoundException, IOException, InvalidInputFileException {
		ArrayList<NamedSequence> sequences = new ArrayList<NamedSequence>();
		String name = null;
		ComposedSequence sequence = null;
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
		int lineNr = 0;
		while (true) {
			String line = br.readLine();
			lineNr += 1;
			if (line==null) {
				if (name!=null) {
					if (sequence==null) throw new InvalidInputFileException("Unexpected end of FASTA file.");
					sequences.add(new NamedSequence(name, alphabet.buildIndexArray(sequence,false,replaceUnknownByMinusOne)));
				}
				break;
			}
			line = line.trim();
			if (line.equals("")) continue;
			if (line.charAt(0)=='>') {
				if (name!=null) {
					if (sequence==null) throw new InvalidInputFileException("Illegal FASTA file: Offending line: "+lineNr);
					sequences.add(new NamedSequence(name, alphabet.buildIndexArray(sequence,false,replaceUnknownByMinusOne)));
					name = null;
					sequence = null;
				}
				name = line.substring(1);
			} else {
				if (name==null) throw new InvalidInputFileException("Illegal FASTA file: Offending line: "+lineNr);
				if (sequence==null) sequence = new ComposedSequence();
				sequence.append(line.toUpperCase());
			}
		}
		if (sequences.isEmpty()) throw new InvalidInputFileException("FASTA file empty.");
		return sequences;
	}

	public static class MinMax {
		private double min;
		private double max;
		public MinMax(double min, double max) {
			this.min = min;
			this.max = max;
		}
		public double getMin() { return min; }
		public double getMax() { return max; }
	}

	public static MinMax readAnnotationTrack(String trackName, String filename, List<NamedSequence> sequences, int binCount) {
		FileInputStream file = null;
		try {
			file = new FileInputStream(filename);
		} catch (FileNotFoundException e) {
			Log.errorln("Annotation file not found, sorry!");
			System.exit(1);
		}
		double min = Double.POSITIVE_INFINITY;
		double max = Double.NEGATIVE_INFINITY;
		Map<String,ArrayList<Double>> tracks = new HashMap<String,ArrayList<Double>>();
		String name = null;
		BufferedReader br = new BufferedReader(new InputStreamReader(file));
		try {
			while (true) {
				String line = br.readLine();
				if (line==null) break;
				line = line.trim();
				if (line.equals("")) continue;
				if (line.charAt(0)=='>') {
					name = line.substring(1);
				} else {
					if (name==null) {
						Log.errorln("Invalid Fasta-File, sorry!");
						System.exit(1);
					}
					ArrayList<Double> track = new ArrayList<Double>(); 
					StringTokenizer st = new StringTokenizer(line, " ", false);
					while (st.hasMoreTokens()) {
						double d = Double.parseDouble(st.nextToken());
						min = Math.min(min,d);
						max = Math.max(max,d);
						track.add(d);
					}
					tracks.put(name, track);
				}
			}
		} catch (IOException e) {
			Log.errorln("I/O failure while reading annotations, sorry!");
			System.exit(1);
		}
		if (sequences.size()<1) {
			Log.errorln("Couldn't read annotations");
			System.exit(1);
		}
		for (NamedSequence ns : sequences) {
			ArrayList<Double> track = tracks.get(ns.getName());
			if (track!=null) {
				int[] intTrack = new int[track.size()];
				double binWidth = (max-min)/binCount;
				for (int i=0; i<track.size(); ++i) {
					intTrack[i] = Math.min((int)((track.get(i)-min)/binWidth),binCount-1);
				}
				ns.addAnnotationTrack(trackName, intTrack);
			}
		}
		return new MinMax(min,max);
	}

	public static List<String> readSequences(String filename) {
		ArrayList<String> sequences = new ArrayList<String>();
		FileInputStream genomeFile = null;
		try {
			genomeFile = new FileInputStream(filename);
		} catch (FileNotFoundException e) {
			Log.errorln("Sequence file not found, sorry!");
			System.exit(1);
		}
		int seqCount = 0;
		int charCount = 0;
		BufferedReader br = new BufferedReader(new InputStreamReader(genomeFile));
		try {
			while (true) {
				String line = br.readLine();
				if (line==null) break;
				sequences.add(line.toUpperCase());
				seqCount+=1;
				charCount+=line.length();
			}
		} catch (IOException e) {
			Log.errorln("I/O failure while reading sequence, sorry!");
			System.exit(1);
		}
		if (sequences.size()<1) {
			Log.errorln("Couldn't read sequences");
			System.exit(1);
		}
		Log.printf(Log.Level.DEBUG, "Successfully read %,d characters in %,d sequences%n", charCount, seqCount);
		return sequences;
	}

	public static String readSequence(String filename) {
		String sequence = null;
		FileInputStream genomeFile = null;
		try {
			genomeFile = new FileInputStream(filename);
		} catch (FileNotFoundException e) {
			Log.errorln("Sequence file not found, sorry!");
			System.exit(1);
		}
		BufferedReader br = new BufferedReader(new InputStreamReader(genomeFile));
		try {
			String line = br.readLine();
			if (line==null) {
				Log.errorln("Couldn't read sequence");
				System.exit(1);
			}
			sequence=line.toUpperCase();
			Log.printf(Log.Level.DEBUG, "Read sequence: %d characters%n", sequence.length());
			Log.printf(Log.Level.DEBUG, "Sequence: %s%n", sequence);
			while (true) {
				line = br.readLine();
				if (line==null) break;
				if (line.equals("")) continue;
				Log.errorln("Cannot handle multiple sequences, sorry!");
				System.exit(1);
			}
		} catch (IOException e) {
			Log.errorln("I/O failure while reading sequence, sorry!");
			System.exit(1);
		}
		return sequence;
	}

	/** Calculate the distribution of the number of sequences a pattern occurs in, i.e. if dist is 
	 *  returned, then dist[k] is the distribution that the pattern occurs in (exactly) k sequences
	 *  (at least once).
	 *  @param sequenceLengths Array of sequence lengths.
	 *  @param singleExpectation Probability that the pattern occurs at any single position.
	 */
	public static double[] calculateSequenceCountDistribution(int[] sequenceLengths, double singleExpectation, int patternLength, double expectedClumpSize, boolean logarithmic) {
		double[] dist = new double[sequenceLengths.length+1];
		if (logarithmic) {
			Arrays.fill(dist, Double.NEGATIVE_INFINITY);
			dist[0]=0.0;
		} else {
			dist[0]=1.0;
		}
		for (int length : sequenceLengths) {
			if (length<patternLength) continue;
			double expectation = singleExpectation*(length-patternLength+1);
			// expected number of clumps
			double lambda = expectation/expectedClumpSize;
			// chance that pattern does not occur in s
			double p_0, p_1;
			if (logarithmic) {
				p_0 = -lambda;
				p_1 = Math.log1p(-Math.exp(p_0));
			} else {
				p_0 = Math.exp(-lambda);
				p_1 = 1.0 - p_0;
			}
			double [] newDist = new double[sequenceLengths.length+1];
			if (logarithmic) Arrays.fill(dist, Double.NEGATIVE_INFINITY);
			for (int i=0; i<dist.length; ++i) {
				if (logarithmic) {
					newDist[i]=LogSpace.logAdd(newDist[i], dist[i]+p_0);
					if (i+1<dist.length) newDist[i]=LogSpace.logAdd(newDist[i], dist[i]+p_1);
				} else {
					newDist[i]+=dist[i]*p_0;
					if (i+1<dist.length) newDist[i+1]+=dist[i]*p_1;
				}
			}
			dist=newDist;
		}
		return dist;
	}

	/** Calculate the distribution of the number of sequences a pattern occurs in, i.e. if dist is 
	 *  returned, then dist[k] is the distribution that the pattern occurs in (exactly) k sequences
	 *  (at least once).
	 *  @param sequences List of sequences, only the sequences lenghts are used.
	 *  @param singleExpectation Probability that the pattern occurs at any single position.
	 */
	public static double[] calculateSequenceCountDistribution(List<String> sequences, double singleExpectation, int patternLength, double expectedClumpSize, boolean logarithmic) {
		int[] sequenceLengths = new int[sequences.size()];
		int n = 0;
		for (String s : sequences) sequenceLengths[n++] = s.length();	
		return calculateSequenceCountDistribution(sequenceLengths, singleExpectation, patternLength, expectedClumpSize, logarithmic);
	}
	
	/** Returns an encoder/decoder to convert a q-gram into an integer and vice versa.
	 *  Compatible with countQGrams methods. */
	public static ProductEncoder qGramEncoder(int q, int alphabetSize) {
		int[] bases = new int[q];
		Arrays.fill(bases, alphabetSize);
		return new ProductEncoder(false,bases);
	}

	/** Counts the q-grams in the given sequence; q-grams containing the special 
	 *  character -1 are ignored. Result is compatible with qGramEncoder(), i.e.
	 *  result[qGramEncoder().encode(qgram)] gives the number of occurrences of 
	 *  qgram.
	 */
	public static double[] countQGrams(int q, int alphabetSize, int[] sequence) {
		return countQGrams(q,alphabetSize,sequence,0);
	}

	public static double[] countQGrams(int q, int alphabetSize, List<NamedSequence> sequences) {
		return countQGrams(q,alphabetSize,sequences,0);
	}

	public static double[] countQGrams(int q, int alphabetSize, List<NamedSequence> sequences, double pseudoCounts) {
		double[] result = new double[(int)Math.pow(alphabetSize, q)];
		Arrays.fill(result, pseudoCounts);
		for (NamedSequence ns : sequences) {
			countQGrams(q,alphabetSize,ns.getSequence(),result);
		}
		return result;
	}

	/** Similar to countQGrams(int,int,int[]) but adds a given number of pseudocounts to each 
	 *  q-gram count. */
	public static double[] countQGrams(int q, int alphabetSize, int[] sequence, double pseudoCounts) {
		double[] result = new double[(int)Math.pow(alphabetSize, q)];
		Arrays.fill(result, pseudoCounts);
		countQGrams(q,alphabetSize,sequence,result);
		return result;
	}

	/** Similar to countQGrams(int,int,int[]) but the resulting q-gram counts are added to
	 *  a given array of word frequencies.
	 *  
	 *  @param wordFrequencies An array of length alphabetSize^q.
	 */
	public static void countQGrams(int q, int alphabetSize, int[] sequence, double[] wordFrequencies) {
		if (wordFrequencies.length!=(int)Math.pow(alphabetSize, q)) throw new IllegalArgumentException("wordFrequencies.length != alphabetSize^q");
		int wordIndex = 0;
		int h = ((int)Math.pow(alphabetSize, q-1));
		int n = 0;
		for (int i=0; i<sequence.length; ++i) {
			// ignore words that contain the special character -1
			if (sequence[i]==-1) {
				n = 0;
				continue;
			}
			if ((sequence[i]<-1) || (sequence[i]>=alphabetSize)) {
				throw new IllegalArgumentException("Illegal character: "+ sequence[i]);
			}
			wordIndex = (wordIndex%h)*alphabetSize + sequence[i];
			n++;
			if (n>=q) wordFrequencies[wordIndex]+=1;
		}
	}

	/** Reads a file assumed to contain a list of q-gram frequencies over the DNA alphabet.
	 *  The value of q is determined from the file contents. The file must contain one q-gram
	 *  per line.
	 */
	public static FiniteMemoryTextModel buildTextModelFromQGramFile(String filename) throws FileNotFoundException, IOException, InvalidInputFileException {
		return buildTextModelFromQGramFile(filename, Alphabet.getDnaAlphabet());
	}

	/** Reads a file assumed to contain a list of q-gram frequencies over the DNA alphabet.
	 *  The value of q is determined from the file contents. The file must contain one q-gram
	 *  per line.
	 */
	public static FiniteMemoryTextModel buildTextModelFromQGramFile(String filename, Alphabet alphabet) throws FileNotFoundException, IOException, InvalidInputFileException {
		double[] qGramCounts = null;
		ProductEncoder encoder = null;
		BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
		int n = 0;
		int q = -1;
		while (true) {
			String line = br.readLine();
			if (line==null)	break;
			n += 1;
			line = line.trim();
			line = line.replace('\t', ' ');
			if (line.equals("")) throw new InvalidInputFileException("Illegal q-gram file: Offending line: "+n);
			StringTokenizer st = new StringTokenizer(line, " ", false);
			int[] qgram;
			double count;
			try {
				qgram = alphabet.buildIndexArray(st.nextToken());
				count = Double.parseDouble(st.nextToken());
				if (st.hasMoreTokens()) throw new InvalidInputFileException("Illegal q-gram file: Offending line: "+n);
			} catch (Exception e) {
				throw new InvalidInputFileException("Illegal q-gram file: Offending line: "+n+" ("+e.getMessage()+")");
			}
			if (q==-1) {
				q = qgram.length;
				encoder = qGramEncoder(q, alphabet.size());
				qGramCounts = new double[(int)Math.pow(alphabet.size(), q)];
				Arrays.fill(qGramCounts, -1.0);
			}
			int i = encoder.encode(qgram);
			if (qGramCounts[i]!=-1) throw new InvalidInputFileException("Duplicate q-gram in line "+n+".");
			qGramCounts[i] = count;
		}
		if (n!=qGramCounts.length) throw new InvalidInputFileException("Illegal q-gram file: Missing q-grams. Read only "+n+".");
		if (q==1) return new IIDTextModel(alphabet.size(), qGramCounts);
		return new MarkovianTextModel(q-1, alphabet.size(), qGramCounts);
	}

	public static FiniteMemoryTextModel buildTextModelFromNamedSequences(List<NamedSequence> l, int alphabetSize, int modelOrder) {
		List<int[]> sequences = sequenceList(l);
		if (modelOrder<0) throw new IllegalArgumentException("Model order must be >=0.");
		if (modelOrder==0) {
			return new IIDTextModel(alphabetSize, sequences);
		} else {
			return new MarkovianTextModel(modelOrder, alphabetSize, sequences);
		}
	}
	
	/** Extracts a list of sequences from a list of named sequences. */
	public static List<int[]> sequenceList(List<NamedSequence> l) {
		List<int[]> result = new ArrayList<int[]>(l.size());
		for (NamedSequence ns : l) {
			result.add(ns.getSequence());
		}
		return result;
	}
	
	public static int editDistance(int[] sequence1, int[] sequence2) {
		int[][] table = new int[sequence1.length+1][sequence2.length+1];
		// Initialize column 0
		for (int i=0; i<=sequence1.length; ++i) table[i][0]=i;
		// Initialize row 0
		for (int j=0; j<=sequence2.length; ++j) table[0][j]=j;
		// Fill rest of table
		for (int i=1; i<=sequence1.length; ++i) {
			for (int j=1; j<=sequence2.length; ++j) {
				int is_not_equal = sequence1[i-1]!=sequence2[j-1]?1:0;
				table[i][j] = Math.min(table[i-1][j-1]+is_not_equal, Math.min(table[i-1][j]+1, table[i][j-1]+1));
			}
		}
		return table[sequence1.length][sequence2.length];
	}
	
	/** Computes the expected number of occurrences on a text of length patternLength.
 	 * 
	 * @param patternLength MUST be the length of the pattern underlying the CDFA.
	 */
	public static double computeExpectation(CDFA cdfa, FiniteMemoryTextModel textModel, int patternLength) {
		MatchCountDAA daa = new MatchCountDAA(cdfa, cdfa.getMaxOutput());
		TextBasedPAA paa = new TextBasedPAA(daa, textModel);
		double[] dist = paa.computeValueDistribution(patternLength);
		double result = 0.0;
		for (int value=1; value<paa.getValueCount(); ++value) {
			result += value*dist[value];
		}
		return result;
	}

	public static double calcExpectedClumpSize(String iupacPattern, boolean addReverseComplement, double[] charDist) {
		Alphabet dnaAlphabet = Alphabet.getDnaAlphabet();
		List<GeneralizedString> genStringList = null;
		genStringList = new ArrayList<GeneralizedString>(1);
		genStringList.add(Iupac.toGeneralizedString(iupacPattern));
		if (addReverseComplement) genStringList.add(Iupac.toGeneralizedString(Iupac.reverseComplementary(iupacPattern)));
		CDFA cdfa = DFAFactory.build(dnaAlphabet, genStringList, 1000);
		cdfa = cdfa.minimizeHopcroft();
		// calculate clump size distribution
		ClumpSizeCalculator csc = new ClumpSizeCalculator(new IIDTextModel(dnaAlphabet.size(), charDist), cdfa, iupacPattern.length());
		double[] clumpSizeDist = csc.clumpSizeDistribution(20, 1e-30);
		double expectedClumpSize = 0.0;
		for (int i=1; i<clumpSizeDist.length; ++i) {
			expectedClumpSize+=clumpSizeDist[i]*i;
		}
		return expectedClumpSize;
	}

}
